# Point the session at the project folder so that the relative file name
# passed to read_excel() further down resolves.
# NOTE(review): this path is specific to one machine; anyone else running the
# script has to edit it, or start R from the project directory instead.
setwd("~/Dropbox/Papers/Bilingual ballot designs")

## STM ##
library(quanteda)
library(stm)
library(tidyverse)
library(readxl)
library(stringr)


# Load the survey export and prepare the columns used by the topic model:
#   Q3 = free-text answer that gets modelled,
#   Q2 = covariate used later in the prevalence formula,
#   Q9 = source of the Republican / Non-Republican indicator.
data <- read_excel("Bilingual Ballots_US Sample.xlsx") %>%
  # Drop the first data row (presumably a second header / question-text row
  # in the export -- TODO confirm against the spreadsheet).
  slice(-1) %>%
  # Keep only respondents with a value in all three columns.
  filter(!is.na(Q3), !is.na(Q2), !is.na(Q9)) %>%
  mutate(
    # Replace the mis-encoded apostrophe sequence with a plain apostrophe.
    Q3 = str_replace_all(Q3, "â€™", "'"),
    # Two-level party indicator: Republican vs. everyone else.
    republican = ifelse(Q9 == "Republican", "Republican", "Non-Republican")
  )

# Build a quanteda corpus from the free-text answers; the remaining columns
# of `data` travel along as document variables.
exp_corpus <- corpus(data, text_field = "Q3")

# Word-level tokenisation, discarding punctuation, symbols, numbers, URLs
# and separators.
my_tokens <- tokens(
  exp_corpus,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Document-feature matrix, cleaned in four steps (order matters):
#   1. lowercase all terms,
#   2. drop English stopwords plus the single letters "a", "b", "c"
#      (presumably the ballot labels -- confirm),
#   3. stem the remaining terms,
#   4. keep terms that occur at least 10 times overall and in at most
#      700 documents.
doc_term_matrix <- my_tokens %>%
  dfm(tolower = TRUE) %>%
  dfm_select(
    pattern = c(stopwords("english"), "a", "b", "c"),
    selection = "remove"
  ) %>%
  dfm_wordstem() %>%
  dfm_trim(min_termfreq = 10, max_docfreq = 700)

# --- Hand the quanteda DFM over to stm.
# convert(to = "stm") returns a list ("out" in stm's own terminology) holding
# everything the model needs.
exp_out <- quanteda::convert(doc_term_matrix, to = "stm")

exp_docs <- exp_out[["documents"]] # indexed terms appearing in each document
exp_vocab <- exp_out[["vocab"]]    # unique terms
exp_meta <- exp_out[["meta"]]      # metadata (docvars, i.e. the covariates)


# Fit a 6-topic structural topic model. Unlike plain LDA, topic prevalence is
# modelled with a regression on document covariates -- here Q2 and the
# Republican / Non-Republican indicator.
stm_fit <- stm(
  documents = exp_docs,           # same object as exp_out$documents
  vocab = exp_vocab,              # same object as exp_out$vocab
  K = 6,                          # number of topics
  prevalence = ~ Q2 + republican, # covariates driving topic prevalence
  data = exp_meta,                # metadata supplying Q2 and republican
  init.type = "Spectral",         # "LDA" is the alternative initialisation
  seed = 123                      # fixed seed; results can change by seed
)

summary(stm_fit)

# Summary plot of the fitted model, labelling each topic with its top 6
# FREX words. plot() dispatches to plot.STM for an object of class "STM".
plot(stm_fit, type = "summary", labeltype = "frex", n = 6)

# Regress the proportions of topics 1-6 on the same covariates that were used
# in the prevalence formula when fitting the model.
# `metadata` is spelled out in full: the shorter `meta =` only reaches this
# argument through R's partial argument matching, which is fragile and is
# flagged by linters.
effects_est <- estimateEffect(
  1:6 ~ Q2 + republican, # regression specification (left-hand side = topics)
  stm_fit,               # the fitted STM
  metadata = exp_meta    # metadata returned by the quanteda -> stm conversion
)
summary(effects_est)

# Graphics settings for the effect plots that follow: a single panel, text
# scaled up 1.5x, bold axis titles (font.lab = 2), italic tick labels
# (font.axis = 3), and a wide left margin (10 lines instead of the usual 4)
# to leave room for the long topic labels.
# NOTE: par() changes persist for the rest of the session / open device.
par(
  mfrow = c(1, 1),
  cex = 1.5,
  font.lab = 2,
  font.axis = 3,
  mar = c(5, 10, 4, 2) + .1
)

# Hand-written label for each of the six topics (topic number followed by a
# few characteristic words).
# NOTE(review): these are tied to one particular model fit; re-check them
# against summary(stm_fit) whenever the data or the model settings change.
customlabels <- c(
  "1: easiest, state, texas, believe",
  "2: easy, simple, follow, prefer",
  "3: vote, people, help, speak",
  "4: translate, design, space, together",
  "5: separated, two, easier, column",
  "6: organized, less, one, see"
)

# Levels of the Q2 covariate used in the contrasts below.
# NOTE(review): presumably these are the exact strings stored in Q2; they
# have to match the data character for character.
ballot_a <- "Ballot A: English only monolingual"
ballot_b <- "Ballot B: English-Spanish bilingual stacked together"
ballot_c <- "Ballot C: English-Spanish bilingual separated into columns"

#' Plot the estimated difference in topic proportions between two covariate
#' levels (value1 minus value2), one row per topic.
#'
#' All four contrast plots share every setting except the covariate, the two
#' levels being compared and the x-axis label, so those are the only required
#' arguments.
#'
#' @param covariate Name of the covariate in the estimateEffect model.
#' @param value1,value2 The two levels of `covariate` to contrast.
#' @param xlab X-axis label.
#' @param effects Output of estimateEffect(); defaults to `effects_est`.
#' @param model Fitted STM; defaults to `stm_fit`.
#' @param labels Custom topic labels; defaults to `customlabels`.
#' @param topics Topics to plot; defaults to all six.
#' @return Invisibly, whatever the estimateEffect plot method returns.
plot_topic_difference <- function(covariate, value1, value2, xlab,
                                  effects = effects_est,
                                  model = stm_fit,
                                  labels = customlabels,
                                  topics = 1:6) {
  invisible(plot(
    effects, # dispatches to plot.estimateEffect
    covariate = covariate,
    model = model,
    method = "difference",
    topics = topics,
    cov.value1 = value1,
    cov.value2 = value2,
    xlab = xlab,
    verbose.labels = FALSE, # TRUE/FALSE spelled out: `F` can be reassigned
    custom.labels = labels,
    labeltype = "custom"
  ))
}

# The axis labels use an explicit "\n" for the line break. Breaking the
# string literal across source lines would embed the code indentation (and a
# trailing space) in the label and push its second line off-centre.

# Ballot condition: stacked bilingual vs. English-only.
plot_topic_difference(
  covariate = "Q2",
  value1 = ballot_b,
  value2 = ballot_a,
  xlab = "Estimated Difference in Topic Proportion:\nStacked vs Monolingual"
)

# Ballot condition: column-separated bilingual vs. English-only.
plot_topic_difference(
  covariate = "Q2",
  value1 = ballot_c,
  value2 = ballot_a,
  xlab = "Estimated Difference in Topic Proportion:\nSeparated vs Monolingual"
)

# Ballot condition: stacked vs. column-separated bilingual.
plot_topic_difference(
  covariate = "Q2",
  value1 = ballot_b,
  value2 = ballot_c,
  xlab = "Estimated Difference in Topic Proportion,\nStacked vs Separated"
)

# Party identification: Republican vs. everyone else.
plot_topic_difference(
  covariate = "republican",
  value1 = "Republican",
  value2 = "Non-Republican",
  xlab = "Estimated Difference in Topic Proportion,\nRepublican vs Non-Republican"
)



